{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:14:14.105075Z",
     "iopub.status.busy": "2023-09-15T15:14:14.104782Z",
     "iopub.status.idle": "2023-09-15T15:14:14.109061Z",
     "shell.execute_reply": "2023-09-15T15:14:14.108424Z"
    }
   },
   "outputs": [],
   "source": [
    "# Note: this notebook is pretty slow because it takes time to open the .csv.gz files. \n",
    "# they cannot be opened with scan_csv (which would be faster using pl.collect_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:14:14.112279Z",
     "iopub.status.busy": "2023-09-15T15:14:14.112018Z",
     "iopub.status.idle": "2023-09-15T15:14:16.386169Z",
     "shell.execute_reply": "2023-09-15T15:14:16.384038Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import re\n",
    "import gzip\n",
    "import pickle\n",
    "import polars as pl\n",
    "\n",
    "sys.path.append(\"../\")\n",
    "import tools\n",
    "## CHANGE\n",
    "root = '' # raw data\n",
    "path_data = ''  # analysis data\n",
    "path_output = '' # curated data\n",
    "\n",
    "START_OVER = True  # if True, will reconstruct data from scratch instead of just adding most recent files\n",
    "\n",
    "pl.Config.set_fmt_str_lengths(100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:14:16.392087Z",
     "iopub.status.busy": "2023-09-15T15:14:16.391253Z",
     "iopub.status.idle": "2023-09-15T15:14:17.314298Z",
     "shell.execute_reply": "2023-09-15T15:14:17.313200Z"
    }
   },
   "outputs": [],
   "source": [
    "# reconstruct data from scratch if START_OVER is True\n",
    "\n",
    "def remove_file(filepath):\n",
    "    try: \n",
    "        os.remove(filepath)\n",
    "    except FileNotFoundError:\n",
    "        pass\n",
    "\n",
    "if START_OVER:\n",
    "    remove_file(path_data + 'filenames_advan.pkl')\n",
    "    remove_file(path_output + 'advan_companies.parquet')\n",
    "\n",
    "# check for new files to add\n",
    "\n",
    "all_files = [\n",
    "    os.path.join(path, name)\n",
    "    for path, subdirs, files in os.walk(root)\n",
    "    for name in files\n",
    "]\n",
    "\n",
    "# open list of files already processed\n",
    "try:\n",
    "    filenames_advan = tools.open_pickle(path_data + 'filenames_advan.pkl')\n",
    "except FileNotFoundError:\n",
    "    filenames_advan = []\n",
    "\n",
    "# open dataframe of files already processed\n",
    "try:\n",
    "    df = pl.read_parquet(path_output + 'advan_companies.parquet')\n",
    "except FileNotFoundError:\n",
    "    df = pl.DataFrame(\n",
    "        schema = {\n",
    "            'company_name': pl.Utf8,\n",
    "            'city': pl.Utf8,\n",
    "            'province': pl.Utf8,\n",
    "            'naics': pl.Int64\n",
    "        }\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:14:17.318670Z",
     "iopub.status.busy": "2023-09-15T15:14:17.318292Z",
     "iopub.status.idle": "2023-09-15T15:37:53.920383Z",
     "shell.execute_reply": "2023-09-15T15:37:53.919105Z"
    }
   },
   "outputs": [],
   "source": [
    "# update and clean data\n",
    "\n",
    "def read_csv_rename(file):\n",
    "    return (\n",
    "        pl.read_csv(\n",
    "            file,\n",
    "            columns=['location_name', 'city', 'region', 'naics_code']\n",
    "            )\n",
    "        .rename(\n",
    "            {\n",
    "                'location_name': 'company_name',\n",
    "                'region': 'province',\n",
    "                'naics_code': 'naics'\n",
    "            }\n",
    "        )\n",
    "        .select(df.columns)\n",
    "    )\n",
    "\n",
    "files_to_add = list(set(all_files).difference(filenames_advan))\n",
    "\n",
    "for i, filename in enumerate(files_to_add):\n",
    "    if i % 10 == 0:\n",
    "        print(i, filename)\n",
    "    df_new = read_csv_rename(filename)\n",
    "    df = pl.concat([df, df_new], how='vertical').unique()\n",
    "\n",
    "df = tools.clean_city_name(df)\n",
    "df = tools.clean_company_name(df)\n",
    "df = (df\n",
    "    .with_columns(pl.col('province').str.to_lowercase())\n",
    "    .unique()\n",
    ")\n",
    "\n",
    "# save dataframe\n",
    "df.write_parquet(path_output + 'advan_companies.parquet')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:37:53.925107Z",
     "iopub.status.busy": "2023-09-15T15:37:53.924795Z",
     "iopub.status.idle": "2023-09-15T15:37:53.937150Z",
     "shell.execute_reply": "2023-09-15T15:37:53.936514Z"
    }
   },
   "outputs": [],
   "source": [
    "# update filenames_advan and save\n",
    "\n",
    "filenames_advan += files_to_add\n",
    "filenames_advan = sorted(list(set(filenames_advan)))  # make sure to avoid duplicates\n",
    "tools.write_pickle(filenames_advan, path_data + 'filenames_advan.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.12 ('env_indeed2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "7120590dfa35e6512fb14e5e70b67446c3e78c7a5c027e908dbb14d6a3f8a0eb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
